********************************************************************************
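* Constructs the (strongly or weakly) connected sets of employers, and the
* associated unique job IDs, used for AKM and for PageRank, and saves the
* intersection of the two sets.
*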
********************************************************************************
********************************************************************************
* PREPARE
********************************************************************************
* open a named log for this script (any log already open under the same name is closed first)
capture log close connected
log using "${LOG_DIR}/log_connected_${ext}.log", name(connected) text replace

* local macros derived from global switches (all are referenced by later sections)
* -- employer category: file-name suffix (categ_str) and variable name (var_categ), both empty unless connecting by category
if $connect_by_categ {
	local categ_str "_${categ_var}"
	local var_categ "${categ_var}"
}
else {
	local categ_str ""
	local var_categ ""
}

* -- specification suffix for file names: empty under the default specification
if $connected_default {
	local ext_str ""
}
else {
	local ext_str "_${ext}"
}

* -- age is carried along only if one of the minimum-hire criteria is switched on
if ($min_hire_UE | $min_hire_tot) {
	local var_age "age"
}
else {
	local var_age ""
}




********************************************************************************
* LOAD AND CLEAN DATA
********************************************************************************
* load the raw data through the user-defined loader
do "${DO_DIR}/FUN_LOAD.do" $year_min $year_max "${vars_list} hire_month sep_month id_unique" // variable list "${vars_list}" defined in VALUE_0_MASTER.do

* overwrite id_unique with a group index over (year, id_unique), so that it identifies jobs across years
tempvar job_year
egen double `job_year' = group(year id_unique)
capture recast double id_unique // errors from recast are deliberately ignored
replace id_unique = `job_year'
drop `job_year'

* harmonize the name of the income variable
rename ${income_var} inc

* drop everything not needed below (i.e., edu ind07_5 muni hours occ02_6 tenure exp_act)
keep year ${empid_var} persid hours_year inc `var_categ' `var_age' hire_month sep_month id_unique

* stop if any retained variable contains a missing value
foreach v of varlist _all {
	assert `v' < .
}

* store the cleaned data; it is re-read by the monthly PageRank section
prog_desc_sum_comp_save "${TEMP_DIR}/temp_connected_${ext}.dta"




********************************************************************************
* CONNECTED SET FOR AKM
********************************************************************************
* continue with the data in memory, restricted to the variables needed for the AKM selection (i.e., drop hire_month sep_month)
keep year ${empid_var} persid hours_year inc `var_categ' `var_age' id_unique

* main job per person-year (AKM selection criterion): within each person-year keep the observation that sorts last on
* (hours_year, inc), i.e., the highest-paid among the longest jobs. Remaining ties are broken by a uniform draw; the
* seed is fixed so that the outcome is deterministic.
set seed 123
gen double tiebreak = runiform()
sort persid year hours_year inc tiebreak
by persid year: drop if _n < _N
drop hours_year inc tiebreak

* drop singleton observations (AKM selection criterion -- for singletons, AKM worker and/or employer fixed effects are not well defined).
* Dropping a singleton employer can create a singleton worker and vice versa, so iterate until a full pass removes nothing.
if $drop_singletons { // only if the sample selection criteria include the restriction to nonsingletons
	local converged = 0
	while !`converged' {
		local n_start = _N
		bys ${empid_var} `var_categ': drop if _N == 1 // (${categ_var}-specific) employer with a single observation across years
		bys persid: drop if _N == 1 // worker with a single observation across years
		local n_end = _N
		local n_removed = `n_start' - `n_end'
		local pct_removed : di %5.2f 100*`n_removed'/`n_start'
		disp "--> dropped `n_removed' singleton observations (`pct_removed'%), N_before = `n_start', N_after = `n_end'."
		local converged = (`n_end' == `n_start')
	}
}

* minimum employer size. AKM selection criterion -- assumption made by Sorkin (2018, QJE) is >= 15 employees/year on average.
* $min_size_emp_years selects how the threshold is applied:
*   0 = to the employer size pooled across years
*   1 = to the pooled size, scaled by the number of years in the sample window
*   2 = to the pooled size, scaled by the number of years in which the employer appears
*   3 = to the employer size in each year separately
if $min_size_emp > 1 {
	if ($min_size_emp_years == 3 & $min_size_emp_nonsingletons) disp as error "USER WARNING: Nonsingleton and minimum employer size selections are such that all observations in year ${year_max} are dropped!"

	* cell within which employer size is measured
	if inlist($min_size_emp_years, 0, 1, 2) local size_cell "${empid_var} `var_categ'"
	else if $min_size_emp_years == 3 local size_cell "${empid_var} `var_categ' year"

	* employer size: either the number of observations whose worker has a later observation in the data, or all observations
	if $min_size_emp_nonsingletons {
		bys persid (year): gen byte has_later_obs = (persid[_n + 1] < .) // 0 for the last observation of each worker
		bys `size_cell': ${gtools}egen double emp_size = total(has_later_obs)
		drop has_later_obs
	}
	else bys `size_cell': gen double emp_size = _N

	* apply the threshold
	if inlist($min_size_emp_years, 0, 3) keep if emp_size >= ${min_size_emp}
	else if $min_size_emp_years == 1 keep if emp_size >= ${min_size_emp}*(${year_max} - ${year_min} + 1)
	else if $min_size_emp_years == 2 {
		bys ${empid_var} `var_categ' year: gen byte first_in_year = 1 if _n == 1
		bys ${empid_var} `var_categ': ${gtools}egen byte n_years_active = total(first_in_year)
		drop first_in_year
		keep if emp_size >= ${min_size_emp}*n_years_active
		drop n_years_active
	}
	drop emp_size
}

* minimum number of years in which a (${categ_var}-specific) employer must appear in the data
if $min_emp_years > 1 {
	* flag one observation per employer-year, then count the flags per employer
	bys ${empid_var} `var_categ' year: gen byte first_in_year = (_n == 1)
	bys ${empid_var} `var_categ': ${gtools}egen byte n_years_active = total(first_in_year)
	keep if n_years_active >= ${min_emp_years}
	drop first_in_year n_years_active
}

* files written in this section
local sel_file "${TEMP_DIR}/temp_selection`ext_str'.dta"
local emp_file "${TEMP_DIR}/connected_akm`categ_str'_${year_min}_${year_max}`ext_str'.dta"
local job_file "${TEMP_DIR}/connected_akm_workers`categ_str'_${year_min}_${year_max}`ext_str'.dta"

* park the selected sample on disk; it is re-read below as input to the connected-set routine
prog_desc_sum_comp_save "`sel_file'"

* list of employer IDs and unique job IDs satisfying the selection criteria
* (drops year persid and age if present. Note: in yearly panel, id_unique automatically identifies year)
keep ${empid_var} `var_categ' id_unique
sort ${empid_var} `var_categ' id_unique
prog_desc_sum_comp_save "`job_file'"

* reload only the variables passed on to the connected-set routine, then delete the parked file
use year ${empid_var} persid `var_categ' using "`sel_file'", clear
erase "`sel_file'"

* call user-defined function to compute connected set
do "${DO_DIR}/FUN_CONNECTED.do" "year" "${empid_var}" "persid"

* connected set of employer IDs
order ${empid_var} `var_categ'
sort ${empid_var} `var_categ'
prog_desc_sum_comp_save "`emp_file'"

* trim the job list to employers inside the (${categ_var}-specific) largest connected set(s).
* var_categ and categ_str are both empty unless connecting by category, so a single merge covers both cases.
use "`job_file'", clear
merge m:1 ${empid_var} `var_categ' using "`emp_file'", keep(match) keepusing(${empid_var} `var_categ') nogen

* connected set of unique job IDs (overwrites the untrimmed list saved above)
keep id_unique
sort id_unique
prog_desc_sum_comp_save "`job_file'"


********************************************************************************
* CONNECTED SET FOR PAGERANK
********************************************************************************
if $connected_pagerank_monthly { // if want to use monthly worker flows to construct PageRank, then construct connected set here
	
	* process data in batches. Batches split workers by mod(persid, n_batches): all records of a given worker are in
	* the same batch, whereas the workers of a given employer are spread across batches. Worker-level steps therefore
	* run inside the loop; employer-level selections run on the pooled data after the loop.
	forval b = 1/$n_batches {
		if $n_batches > 1 disp "--> data batch `b' out of ${n_batches}"
		
		* load data
		use persid ${empid_var} `var_categ' `var_age' year hours_year inc hire_month sep_month id_unique if mod(persid, $n_batches) == `b' - 1 using "${TEMP_DIR}/temp_connected_${ext}.dta", clear // i.e., load all variables
		
		* construct earnings in each month the job is active
		recode sep_month (0=13) // with sep_month = 13, inrange() below is true for every month from hire_month through 12 (presumably 0 codes no separation within the year -- confirm)
		forval m = 1/12 {
			gen float inc`m' = inc if inrange(`m', hire_month, sep_month)
			compress inc`m'
		}
		drop hire_month sep_month

		* reshape data to long format (n makes the reshape identifier unique when a worker has several jobs at the same employer in a year)
		bys persid ${empid_var} year (inc): gen double n = _n
		compress n
		drop inc
		reshape long inc, i(persid ${empid_var} year n) j(month) // Note: could use ${gtools}, but risks running into error, since "A Stata bug prevents gtools from working with more than 2,147,483,647 observations."
		drop n
		compress

		* create variable containing year-month combination
		gen int year_month = ym(year, month)
		drop year month
		format year_month %tm
		compress year_month

		* keep only highest-paid among longest jobs per worker-month
		keep if hours_year < . & inc < . // drops year-month combinations when individual was not employed in RAIS
		set seed 123 // the seed is reset in every batch
		gen double rand = runiform()
		bys persid year_month (hours_year inc rand): keep if _n == _N // Note: use random noise with fixed initial seed in order for this to yield a deterministic outcome.
		
		* keep only relevant variables
		keep year_month ${empid_var} persid `var_categ' `var_age' id_unique // i.e., drop hours_year inc rand
		
		* flag hires. These flags only use lags within a worker, so they can be built batch by batch.
		if $min_hire_UE | $min_hire_tot {
			
			* find earliest year-month combination that each worker appears in data
			bys persid: egen int year_month_min = min(year_month)
			
			* set panel
			xtset persid year_month
			
			* create indicator for new hires from unemployment (i.e., from outside of RAIS)
			if $min_hire_UE gen byte UE = (L.persid == .) if year_month > year_month_min | age > ${age_min}

			* create indicator for new hires
			if $min_hire_tot gen byte hire = (${empid_var} != L.${empid_var}) if year_month > year_month_min | age > ${age_min}
			drop year_month_min age
		}
		
		* save data batch (UE and hire, if created, are kept until the employer-level selection below)
		if $n_batches > 1 save "${TEMP_DIR}/temp_connected_batch_`b'.dta", replace
	}
	
	* append data batches
	if $n_batches > 1 {
		clear
		forval b = 1/$n_batches {
			append using "${TEMP_DIR}/temp_connected_batch_`b'.dta"
			rm "${TEMP_DIR}/temp_connected_batch_`b'.dta"
		}
	}
	
	* make selections based on hiring requirements. Employer totals are computed on the pooled data: computing them
	* inside a batch would count only the hires of the workers assigned to that batch, so the selected sample would
	* depend on the number of batches.
	
	* keep only employers who hired more than a minimum number of workers from nonemployment. Labor market parameter estimation selection criterion -- guarantees that employer has strictly positive sampling rate when estimating job offer cdf F(.) and pdf f(.). Note: Sorkin (2018, QJE) assumes >= 1.
	if $min_hire_UE {
		bys ${empid_var} `var_categ': egen double UE_tot = total(UE)
		keep if UE_tot >= ${min_hire_UE}
		drop UE UE_tot
	}

	* keep only employers who hired more than a minimum number of workers. Poaching Rank estimation selection criterion -- contributes to poaching rank being measured without too much noise. Note: Bagger and Lentz (2019, REStud) assume >= 15.
	if $min_hire_tot {
		bys ${empid_var} `var_categ': egen double hire_tot = total(hire)
		keep if hire_tot >= ${min_hire_tot}
		drop hire hire_tot
	}
	
	* save complete data
	prog_desc_sum_comp_save "${TEMP_DIR}/temp_selection`ext_str'.dta"
	
	* keep only variables necessary to uniquely identify jobs and associated (${categ_var}-specific) firm IDs in each year-month combination
	keep year_month ${empid_var} `var_categ' id_unique // i.e., drop persid. Note: in monthly panel, id_unique does not automatically identify year, hence also need year_month
	sort year_month ${empid_var} `var_categ' id_unique

	* save list of employer IDs and unique job IDs satisfying the selection criteria
	prog_desc_sum_comp_save "${TEMP_DIR}/connected_pagerank_workers`categ_str'_${year_min}_${year_max}`ext_str'.dta"

	* load complete data
	use year_month ${empid_var} persid `var_categ' using "${TEMP_DIR}/temp_selection`ext_str'.dta", clear // i.e., do not load id_unique
	rm "${TEMP_DIR}/temp_selection`ext_str'.dta"
	
	* call user-defined function to compute connected set
	do "${DO_DIR}/FUN_CONNECTED.do" "year_month" "${empid_var}" "persid"
	
	* order and sort variables
	order ${empid_var} `var_categ'
	sort ${empid_var} `var_categ'

	* save connected set of firm-${categ_var} IDs
	prog_desc_sum_comp_save "${TEMP_DIR}/connected_pagerank`categ_str'_${year_min}_${year_max}`ext_str'.dta"
	
	* trim down list of unique job IDs by removing employer IDs outside of largest connected set(s)
	use "${TEMP_DIR}/connected_pagerank_workers`categ_str'_${year_min}_${year_max}`ext_str'.dta", clear
	if $connect_by_categ merge m:1 ${empid_var} ${categ_var} using "${TEMP_DIR}/connected_pagerank_${categ_var}_${year_min}_${year_max}`ext_str'.dta", keep(match) keepusing(${empid_var} ${categ_var}) nogen
	else merge m:1 ${empid_var} using "${TEMP_DIR}/connected_pagerank_${year_min}_${year_max}`ext_str'.dta", keep(match) keepusing(${empid_var}) nogen

	* keep only variables relevant for connected set of unique job IDs
	keep id_unique year_month // i.e., drop ${empid_var} `var_categ'
	sort id_unique year_month
	order id_unique year_month
	
	* keep only one observation per job ID and year-month (since there might be multiple ones in the expanded monthly long-format data)
	bys id_unique year_month: keep if _n == 1

	* save connected set of worker IDs (overwrites the untrimmed list saved above)
	prog_desc_sum_comp_save "${TEMP_DIR}/connected_pagerank_workers`categ_str'_${year_min}_${year_max}`ext_str'.dta"
}
else { // else, if want to use yearly worker flows to construct PageRank, then use AKM connected set
	* copy the AKM job-level and employer-level connected sets under the PageRank file names
	use "${TEMP_DIR}/connected_akm_workers`categ_str'_${year_min}_${year_max}`ext_str'.dta", clear
	prog_desc_sum_comp_save "${TEMP_DIR}/connected_pagerank_workers`categ_str'_${year_min}_${year_max}`ext_str'.dta"
	use "${TEMP_DIR}/connected_akm`categ_str'_${year_min}_${year_max}`ext_str'.dta", clear
	prog_desc_sum_comp_save "${TEMP_DIR}/connected_pagerank`categ_str'_${year_min}_${year_max}`ext_str'.dta"
}

* delete the cleaned input data saved in the LOAD AND CLEAN DATA section; it is not read again in this script
erase "${TEMP_DIR}/temp_connected_${ext}.dta"




********************************************************************************
* FORM INTERSECTION OF CONNECTED SETS
********************************************************************************
* NOTE(review): the two output files of this section do not carry the specification suffix ext_str, unlike all other
* files written by this script -- confirm that this is what the scripts reading them expect.

* employer-level intersection: AKM employers that are also in the PageRank connected set.
* var_categ and categ_str are both empty unless connecting by category, so a single merge covers both cases.
use "${TEMP_DIR}/connected_akm`categ_str'_${year_min}_${year_max}`ext_str'.dta", clear
merge m:1 ${empid_var} `var_categ' using "${TEMP_DIR}/connected_pagerank`categ_str'_${year_min}_${year_max}`ext_str'.dta", keep(match) keepusing(${empid_var} `var_categ') nogen
prog_desc_sum_comp_save "${TEMP_DIR}/connected_both`categ_str'_${year_min}_${year_max}.dta"

* job-level intersection: PageRank job IDs that are also in the AKM connected set
use "${TEMP_DIR}/connected_pagerank_workers`categ_str'_${year_min}_${year_max}`ext_str'.dta", clear
merge m:1 id_unique using "${TEMP_DIR}/connected_akm_workers`categ_str'_${year_min}_${year_max}`ext_str'.dta", keep(match) keepusing(id_unique) nogen
prog_desc_sum_comp_save "${TEMP_DIR}/connected_both_workers`categ_str'_${year_min}_${year_max}.dta"




********************************************************************************
* FINAL HOUSEKEEPING
********************************************************************************
* close the named log "connected" that was opened at the top of this script
log close connected
